Take-home Exercise 1

Author

Chang Fang Yu

Modified

February 11, 2025

  1. Overview

  2. Getting Started
    **Setting the Analytical Tools**

# Load every package used in this document (pacman installs any that are
# missing). The list must not end with a comma: p_load() takes bare package
# names, so a trailing comma hands it one extra, empty argument.
pacman::p_load(tidyverse, haven, knitr,
               patchwork, ggthemes, scales,
               ggridges, ggpubr, gganimate, ggraph, plotly,
               ggdist, ggtext, ggalt,
               cowplot, ggnewscale, ggmosaic, hrbrthemes, DT)
**Data Sources**
  3. Data Pre-processing

    Importing Data

# Read the Japan heart attack data set from CSV, then list every column with
# its type and first few values.
JHA <- readr::read_csv(file = "data/japan_heart_attack_dataset.csv")
JHA %>%
  glimpse()
Rows: 30,000
Columns: 32
$ Age                     <dbl> 56, 69, 46, 32, 60, 25, 78, 38, 56, 75, 36, 40…
$ Gender                  <chr> "Male", "Male", "Male", "Female", "Female", "F…
$ Region                  <chr> "Urban", "Urban", "Rural", "Urban", "Rural", "…
$ Smoking_History         <chr> "Yes", "No", "Yes", "No", "No", "No", "No", "Y…
$ Diabetes_History        <chr> "No", "No", "No", "No", "No", "No", "Yes", "No…
$ Hypertension_History    <chr> "No", "No", "No", "No", "No", "No", "Yes", "No…
$ Cholesterol_Level       <dbl> 186.4002, 185.1367, 210.6966, 211.1655, 223.81…
$ Physical_Activity       <chr> "Moderate", "Low", "Low", "Moderate", "High", …
$ Diet_Quality            <chr> "Poor", "Good", "Average", "Good", "Good", "Go…
$ Alcohol_Consumption     <chr> "Low", "Low", "Moderate", "High", "High", "Hig…
$ Stress_Levels           <dbl> 3.644786, 3.384056, 3.810911, 6.014878, 6.8068…
$ BMI                     <dbl> 33.96135, 28.24287, 27.60121, 23.71729, 19.771…
$ Heart_Rate              <dbl> 72.30153, 57.45764, 64.65870, 55.13147, 76.667…
$ Systolic_BP             <dbl> 123.90209, 129.89331, 145.65490, 131.78522, 10…
$ Diastolic_BP            <dbl> 85.68281, 73.52426, 71.99481, 68.21133, 92.902…
$ Family_History          <chr> "No", "Yes", "No", "No", "No", "No", "No", "No…
$ Heart_Attack_Occurrence <chr> "No", "No", "No", "No", "No", "No", "No", "No"…
$ Extra_Column_1          <dbl> 0.40498852, 0.03627815, 0.85297888, 0.39085280…
$ Extra_Column_2          <dbl> 0.43330004, 0.51256694, 0.21959083, 0.29684675…
$ Extra_Column_3          <dbl> 0.62871236, 0.66839275, 0.61343656, 0.15572404…
$ Extra_Column_4          <dbl> 0.70160955, 0.11552874, 0.50800995, 0.87025144…
$ Extra_Column_5          <dbl> 0.49814235, 0.42381938, 0.90066981, 0.39035591…
$ Extra_Column_6          <dbl> 0.007901312, 0.083932768, 0.227205241, 0.40318…
$ Extra_Column_7          <dbl> 0.79458257, 0.68895108, 0.49634358, 0.74140891…
$ Extra_Column_8          <dbl> 0.29077922, 0.83016364, 0.75210679, 0.22396813…
$ Extra_Column_9          <dbl> 0.49719307, 0.63449028, 0.18150125, 0.32931387…
$ Extra_Column_10         <dbl> 0.52199452, 0.30204337, 0.62918031, 0.14319054…
$ Extra_Column_11         <dbl> 0.79965663, 0.04368285, 0.01827617, 0.90778075…
$ Extra_Column_12         <dbl> 0.72239788, 0.45166789, 0.06322702, 0.54232201…
$ Extra_Column_13         <dbl> 0.1487387, 0.8786714, 0.1465122, 0.9224606, 0.…
$ Extra_Column_14         <dbl> 0.8340099, 0.5356022, 0.9972962, 0.6262165, 0.…
$ Extra_Column_15         <dbl> 0.061632229, 0.617825340, 0.974455410, 0.22860…
# Render the data set as an interactive table using DT's "compact" styling.
JHA %>%
  DT::datatable(class = "compact")

Checking for Duplicates and Missing Values

# Show every row that exactly repeats an earlier row (none expected).
JHA[which(duplicated(JHA)), ]
# A tibble: 0 × 32
# ℹ 32 variables: Age <dbl>, Gender <chr>, Region <chr>, Smoking_History <chr>,
#   Diabetes_History <chr>, Hypertension_History <chr>,
#   Cholesterol_Level <dbl>, Physical_Activity <chr>, Diet_Quality <chr>,
#   Alcohol_Consumption <chr>, Stress_Levels <dbl>, BMI <dbl>,
#   Heart_Rate <dbl>, Systolic_BP <dbl>, Diastolic_BP <dbl>,
#   Family_History <chr>, Heart_Attack_Occurrence <chr>, Extra_Column_1 <dbl>,
#   Extra_Column_2 <dbl>, Extra_Column_3 <dbl>, Extra_Column_4 <dbl>, …
# Count the missing cells across the whole data set.
JHA %>%
  is.na() %>%
  sum()
[1] 0
  4. EDA
# Numeric columns to profile (the Extra_Column_* fields are left out).
numeric_vars <- c(
  "Age", "Cholesterol_Level", "BMI", "Heart_Rate",
  "Systolic_BP", "Diastolic_BP", "Stress_Levels"
)

# One histogram panel per numeric variable. The data is reshaped to long form
# so a single facet_wrap() call can draw every panel, each on its own scales.
ggplot(
  data = pivot_longer(
    JHA,
    cols = all_of(numeric_vars),
    names_to = "Variable",
    values_to = "Value"
  ),
  mapping = aes(x = Value, fill = Variable)
) +
  facet_wrap(~Variable, scales = "free") +
  geom_histogram(bins = 30, color = "black", alpha = 0.5) +
  labs(title = "Histogram of Numeric Variables") +
  theme_minimal()

# Character columns to profile.
categorical_vars <- c(
  "Gender", "Region", "Smoking_History", "Diabetes_History",
  "Hypertension_History", "Physical_Activity", "Diet_Quality",
  "Alcohol_Consumption", "Family_History", "Heart_Attack_Occurrence"
)

# One bar chart per categorical variable, three panels per row. Only the
# x-axis is freed, so the count axis stays comparable across panels.
ggplot(
  data = pivot_longer(
    JHA,
    cols = all_of(categorical_vars),
    names_to = "Variable",
    values_to = "Value"
  ),
  mapping = aes(x = Value, fill = Variable)
) +
  facet_wrap(~Variable, ncol = 3, scales = "free_x") +
  geom_bar() +
  labs(title = "Distribution of Categorical Variables") +
  theme_minimal() +
  # Must follow theme_minimal(), which would otherwise reset the tilt.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# BMI by gender as boxplots; the red point marks each group's mean.
JHA %>%
  ggplot(aes(x = Gender, y = BMI)) +
  geom_boxplot() +
  stat_summary(fun = "mean", geom = "point", size = 4, colour = "red")

# Numeric columns summarised as boxplots.
numerical_vars <- c("Age", "Cholesterol_Level", "BMI", "Heart_Rate",
                    "Systolic_BP", "Diastolic_BP", "Stress_Levels")

# One boxplot per variable. The variables have different units and ranges
# (Stress_Levels is a single-digit score, Cholesterol_Level is around 200),
# so each gets its own panel and scales; on one shared y-axis the
# small-range boxes flatten into unreadable slivers.
JHA %>%
  pivot_longer(cols = all_of(numerical_vars), names_to = "Variable", values_to = "Value") %>%
  ggplot(aes(x = Variable, y = Value, fill = Variable)) +
  # The facet strips already name each variable, so the legend is redundant.
  geom_boxplot(show.legend = FALSE) +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal() +
  labs(title = "Boxplot of Numerical Variables",
       x = "Variable", y = "Value") +
  # Each panel holds a single box; its tick label would repeat the strip.
  theme(axis.text.x = element_blank())

# Cholesterol level for the groups with and without a heart attack.
JHA %>%
  ggplot(aes(
    x = Heart_Attack_Occurrence,
    y = Cholesterol_Level,
    fill = Heart_Attack_Occurrence
  )) +
  geom_boxplot() +
  labs(
    title = "Cholesterol Level by Heart Attack Occurrence",
    x = "Heart Attack Occurrence",
    y = "Cholesterol Level"
  ) +
  theme_minimal()

# Share of heart-attack outcomes within each smoking-history group.
# position = "fill" stacks every bar to 1, so the y-axis is a proportion.
JHA %>%
  ggplot(aes(x = Smoking_History, fill = Heart_Attack_Occurrence)) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion of Heart Attack Occurrence by Smoking History",
    x = "Smoking History",
    y = "Proportion"
  ) +
  theme_minimal()

# Share only the first 500 rows with the linked widgets that use `d`.
# Keeping the head() result (instead of just printing it) avoids sending
# all 30,000 rows to the browser-side plot and table.
JHA_sample <- head(JHA, 500)
d <- highlight_key(JHA_sample)
JHA_sample
# A tibble: 500 × 32
     Age Gender Region Smoking_History Diabetes_History Hypertension_History
   <dbl> <chr>  <chr>  <chr>           <chr>            <chr>               
 1    56 Male   Urban  Yes             No               No                  
 2    69 Male   Urban  No              No               No                  
 3    46 Male   Rural  Yes             No               No                  
 4    32 Female Urban  No              No               No                  
 5    60 Female Rural  No              No               No                  
 6    25 Female Rural  No              No               No                  
 7    78 Male   Urban  No              Yes              Yes                 
 8    38 Female Urban  Yes             No               No                  
 9    56 Male   Rural  No              No               Yes                 
10    75 Male   Urban  No              No               No                  
# ℹ 490 more rows
# ℹ 26 more variables: Cholesterol_Level <dbl>, Physical_Activity <chr>,
#   Diet_Quality <chr>, Alcohol_Consumption <chr>, Stress_Levels <dbl>,
#   BMI <dbl>, Heart_Rate <dbl>, Systolic_BP <dbl>, Diastolic_BP <dbl>,
#   Family_History <chr>, Heart_Attack_Occurrence <chr>, Extra_Column_1 <dbl>,
#   Extra_Column_2 <dbl>, Extra_Column_3 <dbl>, Extra_Column_4 <dbl>,
#   Extra_Column_5 <dbl>, Extra_Column_6 <dbl>, Extra_Column_7 <dbl>, …
# Scatter plot of cholesterol level against BMI, built on the shared data
# object `d` so it can be linked to the table below.
# Axis limits are left to ggplot2: a fixed 0-100 window would cut off
# Cholesterol_Level, whose values sit around 185-225 in this data set.
p <- ggplot(d,
            aes(BMI,
                Cholesterol_Level)) +
  geom_point(size = 1)

# Convert to plotly and highlight on the "plotly_selected" event
# (box/lasso selection).
gg <- highlight(ggplotly(p),
                "plotly_selected")

# Place the plot and a data table built from the same `d` side by side.
crosstalk::bscols(gg,
                  DT::datatable(d),
                  widths = 5)
# BMI against systolic blood pressure, coloured by gender. The points are
# semi-transparent so overlapping observations remain visible.
JHA %>%
  ggplot(aes(x = BMI, y = Systolic_BP, color = Gender)) +
  geom_point(alpha = 0.5) +
  labs(title = "Scatter Plot of BMI vs Systolic BP") +
  theme_minimal()

# Share of hypertension history within each smoking-history group.
# position = "fill" stacks every bar to 1, so the y-axis is a proportion.
JHA %>%
  ggplot(aes(x = Smoking_History, fill = Hypertension_History)) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion of Hypertension by Smoking History",
    y = "Proportion"
  ) +
  theme_minimal()

library(GGally)

# Pairs plot of the numeric variables listed in numeric_vars.
ggpairs(select(JHA, all_of(numeric_vars)))

# Distribution of cholesterol level within each gender.
JHA %>%
  ggplot(aes(x = Gender, y = Cholesterol_Level)) +
  geom_violin()

# Distribution of BMI within each gender.
JHA %>%
  ggplot(aes(x = Gender, y = BMI)) +
  geom_violin()

# Distribution of systolic blood pressure within each region.
JHA %>%
  ggplot(aes(x = Region, y = Systolic_BP)) +
  geom_violin()

# Distribution of BMI within each region.
JHA %>%
  ggplot(aes(x = Region, y = BMI)) +
  geom_violin()

  5. Summary and Conclusion

  6. References